# Read the event-cost data, skipping the title row. `<-` for assignment and
# show_col_types = FALSE to quiet the column-spec message (per readr's hint).
Eventdf <- read_csv("EventCostBDD.csv", skip = 1, show_col_types = FALSE)
## Rows: 376 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): Name, Disaster, Disaster Group, Begin Date, End Date, Central Day,...
## dbl (17): Yb, Mb, Db, Ye, Me, De, Total CPI-Adjusted Cost (Millions of Dolla...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look at the parsed event-level data (one row per disaster event).
head(Eventdf)
## # A tibble: 6 × 24
##   Name       Disaster `Disaster Group` `Begin Date`    Yb    Mb    Db `End Date`
##   <chr>      <chr>    <chr>            <chr>        <dbl> <dbl> <dbl> <chr>     
## 1 Southern … Flooding SevStorm/Flood   4/10/1980     1980     4    10 4/17/1980 
## 2 Hurricane… Tropica… Tropical Cyclone 8/7/1980      1980     8     7 8/11/1980 
## 3 Central/E… Drought  WF/Drought       6/1/1980      1980     6     1 11/30/1980
## 4 Florida F… Freeze   Winter/Freeze    1/12/1981     1981     1    12 1/14/1981 
## 5 Severe St… Severe … SevStorm/Flood   5/5/1981      1981     5     5 5/10/1981 
## 6 Midwest/S… Winter … Winter/Freeze    1/8/1982      1982     1     8 1/16/1982 
## # ℹ 16 more variables: Ye <dbl>, Me <dbl>, De <dbl>,
## #   `Total CPI-Adjusted Cost (Millions of Dollars)` <dbl>, Deaths <dbl>,
## #   `Durn (days)` <dbl>, `Durn (Weeks)` <dbl>, `Durn (Cal Mos)` <dbl>,
## #   `Central Day` <chr>, `Day of W_yr` <dbl>, `MidPt Mo` <dbl>, W_Yr <dbl>,
## #   W_Week <dbl>, W_Mo <dbl>, SeasNum <dbl>, Season <chr>
  1. (5pts) Using monthly data from EventCostBDD.csv file, analyze seasonality by disaster type. Create at least four plots (learned in class) by disaster group.
# Keep only the columns needed for the seasonality plots and parse the begin
# date. The original group_by() was redundant -- mutate(Date = ...) is
# row-wise -- and it left a grouped tibble behind.
disas <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Month of occurrence over time, one coloured line per disaster group.
ggplot(disas, aes(x = Date, y = W_Mo, color = `Disaster Group`)) +
  geom_line() +
  labs(x = "Time", y = "Month Number",
       title = "Month Number for Different Disasters") +
  theme_classic()

# SevStorm/Flood only; filter first so fewer dates are parsed, and drop the
# redundant group_by() (mutate is row-wise and the grouping was never used).
disas1 <- Eventdf %>%
  filter(`Disaster Group` == "SevStorm/Flood") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo) %>%
  mutate(Date = mdy(`Begin Date`))

# Raw series plus three loess smoothers with increasing span
# (0.2 = most local/wiggly, 1 = smoothest) to compare local vs global trend.
ggplot(disas1, aes(x = Date, y = W_Mo)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Month Number",
       title = "Month Numbers for SevStorm/Flood") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Month-number series again, with lines additionally split by SeasNum.
# NOTE(review): grouping by SeasNum connects points of the same season
# across different years -- confirm this is the intended grouping.
ggplot(data = disas,
       mapping = aes(x = Date, y = W_Mo,
                     color = `Disaster Group`, group = SeasNum)) +
  geom_line() +
  labs(x = "Time", y = "Month Number",
       title = "Month Number for Different Disasters Grouped by SeasNum and Disaster Group") +
  theme_classic()

# Winter/Freeze only; filter first and drop the redundant group_by()
# (mutate is row-wise; the grouping was never used).
disas2 <- Eventdf %>%
  filter(`Disaster Group` == "Winter/Freeze") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo) %>%
  mutate(Date = mdy(`Begin Date`))

# Raw series plus three loess smoothers at increasing spans.
ggplot(disas2, aes(x = Date, y = W_Mo)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Month Number",
       title = "Month Number for Winter/Freeze") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Rebuild disas with SeasNum (duplicated from above so this chunk stands
# alone); the redundant group_by() is dropped.
disas <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Season number over time by group. Title corrected: the y aesthetic is
# SeasNum, not the month number.
ggplot(disas, aes(x = Date, y = SeasNum, color = `Disaster Group`)) +
  geom_line() +
  labs(x = "Time", y = "Season Number",
       title = "Season Number for Different Disasters") +
  theme_classic()

# SevStorm/Flood only, plotted against season number.
disas1 <- Eventdf %>%
  filter(`Disaster Group` == "SevStorm/Flood") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Loess smoothers at three spans. Axis label and title corrected: the y
# aesthetic is SeasNum, so "Month Number" was wrong.
ggplot(disas1, aes(x = Date, y = SeasNum)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Season Number",
       title = "Season Number for SevStorm/Flood") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

  1. (5pts) Using seasonal data from EventCostBDD.csv, analyze seasonality by disaster type. Create at least four plots (learned in class) by disaster group.
# Winter/Freeze only, plotted against season number.
disas1 <- Eventdf %>%
  filter(`Disaster Group` == "Winter/Freeze") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Loess smoothers at three spans. Axis label and title corrected: the y
# aesthetic is SeasNum, so "Month Number" was wrong.
ggplot(disas1, aes(x = Date, y = SeasNum)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Season Number",
       title = "Season Number for Winter/Freeze") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Tropical Cyclone only, plotted against season number.
disas1 <- Eventdf %>%
  filter(`Disaster Group` == "Tropical Cyclone") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Loess smoothers at three spans. Axis label and title corrected: the y
# aesthetic is SeasNum, so "Month Number" was wrong.
ggplot(disas1, aes(x = Date, y = SeasNum)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Season Number",
       title = "Season Number for Tropical Cyclone") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# WF/Drought only, plotted against season number.
disas1 <- Eventdf %>%
  filter(`Disaster Group` == "WF/Drought") %>%
  select(`Begin Date`, `Disaster Group`, Disaster, Season,
         `Total CPI-Adjusted Cost (Millions of Dollars)`, W_Mo, SeasNum) %>%
  mutate(Date = mdy(`Begin Date`))

# Loess smoothers at three spans. Axis label and title corrected: the y
# aesthetic is SeasNum, so "Month Number" was wrong.
ggplot(disas1, aes(x = Date, y = SeasNum)) +
  geom_line() +
  geom_smooth(se = FALSE, span = 0.2, color = "red") +
  geom_smooth(se = FALSE, span = 1, color = "blue") +
  geom_smooth(se = FALSE, span = 0.6, color = "green") +
  labs(x = "Time", y = "Season Number",
       title = "Season Number for WF/Drought") +
  theme_classic()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

  1. (6pts) Create the TS plot of the number of annual events by disaster group. Show all 4 TS lines on the same plot, provide a legend and label the plot. Can we say that the number of events increased over time, supporting a debate about climate change?
# One row per WF/Drought event with the yearly event count attached.
# reframe() replaces the multi-row summarise() deprecated in dplyr 1.1.0,
# and the date is parsed once instead of twice.
disas_WF <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Ye) %>%
  filter(`Disaster Group` == "WF/Drought") %>%
  mutate(Date = mdy(`Begin Date`)) %>%
  group_by(Ye) %>%
  reframe(Date, number = n()) %>%
  drop_na()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Ye'. You can override using the `.groups`
## argument.
# One row per Winter/Freeze event with the yearly event count attached.
# reframe() replaces the multi-row summarise() deprecated in dplyr 1.1.0.
disas_Winter <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Ye) %>%
  filter(`Disaster Group` == "Winter/Freeze") %>%
  mutate(Date = mdy(`Begin Date`)) %>%
  group_by(Ye) %>%
  reframe(Date, number = n()) %>%
  drop_na()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Ye'. You can override using the `.groups`
## argument.
# One row per Tropical Cyclone event with the yearly event count attached.
# reframe() replaces the multi-row summarise() deprecated in dplyr 1.1.0.
disas_Tropical <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Ye) %>%
  filter(`Disaster Group` == "Tropical Cyclone") %>%
  mutate(Date = mdy(`Begin Date`)) %>%
  group_by(Ye) %>%
  reframe(Date, number = n()) %>%
  drop_na()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Ye'. You can override using the `.groups`
## argument.
# One row per SevStorm/Flood event with the yearly event count attached.
# reframe() replaces the multi-row summarise() deprecated in dplyr 1.1.0.
disas_Flood <- Eventdf %>%
  select(`Begin Date`, `Disaster Group`, Ye) %>%
  filter(`Disaster Group` == "SevStorm/Flood") %>%
  mutate(Date = mdy(`Begin Date`)) %>%
  group_by(Ye) %>%
  reframe(Date, number = n()) %>%
  drop_na()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'Ye'. You can override using the `.groups`
## argument.
# Fixed colour mapping so each disaster group always keeps the same colour.
colors <- c("WF/Drought" = "orange", "Winter/Freeze" = "slategray2",
            "Tropical Cyclone" = "lightseagreen", "SevStorm/Flood" = "blue")

# Annual event counts, one line per disaster group.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
ggplot() +
  geom_line(aes(x = Date, y = number, color = "WF/Drought"), linewidth = 1, data = disas_WF) +
  geom_line(aes(x = Date, y = number, color = "Winter/Freeze"), linewidth = 1, data = disas_Winter) +
  geom_line(aes(x = Date, y = number, color = "Tropical Cyclone"), linewidth = 1, data = disas_Tropical) +
  geom_line(aes(x = Date, y = number, color = "SevStorm/Flood"), linewidth = 1, data = disas_Flood) +
  labs(x = "Time", y = "Number of Disasters Per Annum",
       title = "The Number of Disasters Per Annum By Disaster Group", color = "Legend") +
  theme_bw() +
  scale_color_manual(values = colors)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

There do appear to be slight changes in climate, especially an increase in flooding and severe storms. There has also been a slight increase in tropical cyclones over the past few years; however, Winter/Freeze and Drought have stayed roughly the same.

  1. (6pts) Write a compelling story about your findings. Pretend that you have to publish this story in Miami’s Newsletter. Make sure your story is understood by a non-statistician. Write a minimum of 10 sentences for a full credit

When looking at climate change, natural disasters are an area where people want to see information. Looking at the plot titled "The Number of Disasters Per Annum By Disaster Group," there are a couple of trends that can be seen. First, the number of disasters such as tropical cyclones and severe storms/flooding has increased tremendously over the past few years, which is very different from their numbers back in the 1980s and 1990s. Looking at the plot labeled "Month Numbers for Different Disasters," it can be seen that there is seasonality among a couple of the disaster groups. Specifically for WF/Drought, Winter/Freeze, and Tropical Cyclones, these groups all stay within one season or just a couple of months. WF/Drought normally happens between June and August. Winter/Freeze normally happens between January and March. Tropical Cyclones can happen anywhere between July and November. All of this shows the seasonality, and paired with the plot titled "The Number of Disasters Per Annum By Disaster Group," we could come to the conclusion that there is some evidence of climate change as a result of disaster types and seasonality. However, seasonality plays a very small role, as the disaster group that went through the biggest change is the only disaster type that did not show seasonality. That disaster group is severe storms, which was shown to be a year-round occurrence.

  1. (6pts) Build 4 seasonal means models without an intercept for monthly cost data by disaster group. Write down the equation of these models including all assumptions. Provide the interpretation of the coefficients and discuss the model fit. A min of 5 full sentences is required.

\(\mu_t = \beta_0 + \beta_1 \text{MonthFebruary} + \beta_2 \text{MonthMarch} + ... + \beta_{11} \text{MonthDecember}\)

# WF/Drought monthly cost records; rows with a missing cost are removed.
season_wf <- Eventdf %>%
  filter(`Disaster Group` == "WF/Drought") %>%
  dplyr::select(Mb, `Disaster Group`,
                `Total CPI-Adjusted Cost (Millions of Dollars)`) %>%
  drop_na()

# Echo the filtered tibble.
season_wf
## # A tibble: 53 × 3
##       Mb `Disaster Group` `Total CPI-Adjusted Cost (Millions of Dollars)`
##    <dbl> <chr>                                                      <dbl>
##  1     6 WF/Drought                                                39579 
##  2     6 WF/Drought                                                 9307.
##  3     6 WF/Drought                                                 5050.
##  4     6 WF/Drought                                                53010.
##  5     6 WF/Drought                                                 7628.
##  6     6 WF/Drought                                                 1666.
##  7     3 WF/Drought                                                 6859.
##  8    10 WF/Drought                                                 7359 
##  9     6 WF/Drought                                                 2705.
## 10     9 WF/Drought                                                 2908.
## # ℹ 43 more rows
# Treat the WF/Drought costs as a monthly series starting June 1980.
# NOTE(review): the events are irregularly spaced, so mapping them onto a
# regular monthly ts is an approximation -- confirm this is intended.
season_wf.ts <- ts(season_wf$`Total CPI-Adjusted Cost (Millions of Dollars)`, start = c(1980, 6), frequency = 12)

# Month-indicator factor (TSA::season) and the seasonal-means regression.
month <- season(season_wf.ts)
seasonal_means <- lm(season_wf.ts ~month)
summary(seasonal_means)
## 
## Call:
## lm(formula = season_wf.ts ~ month)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12815  -6028  -2808   2235  38271 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       11899       5722   2.080   0.0439 *
## monthFebruary     -7676       8092  -0.949   0.3484  
## monthMarch          874       8092   0.108   0.9145  
## monthApril         -880       8092  -0.109   0.9139  
## monthMay          -6606       8092  -0.816   0.4190  
## monthJune           290       7676   0.038   0.9700  
## monthJuly         -5804       7676  -0.756   0.4539  
## monthAugust       -4143       7676  -0.540   0.5923  
## monthSeptember     2840       7676   0.370   0.7133  
## monthOctober      -4734       7676  -0.617   0.5409  
## monthNovember     -4028       8092  -0.498   0.6213  
## monthDecember     -1204       8092  -0.149   0.8824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11440 on 41 degrees of freedom
## Multiple R-squared:  0.09231,    Adjusted R-squared:  -0.1512 
## F-statistic: 0.3791 on 11 and 41 DF,  p-value: 0.957
# Overlay the fitted monthly means as short horizontal red segments.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
fitted_model <- data.frame(date = time(season_wf.ts), fit = fitted(seasonal_means))

autoplot(season_wf.ts) +
  labs(y = "Cost in Millions") +
  geom_segment(aes(x = date - 0.05, xend = date + 0.05, y = fit, yend = fit),
               linewidth = 2, color = "red", data = fitted_model)

\(\mu_t = \beta_0 + \beta_1 \text{MonthFebruary} + \beta_2 \text{MonthMarch} + ... + \beta_{11} \text{MonthDecember}\)

# Winter/Freeze monthly cost records (missing costs dropped).
season_Winter <- Eventdf %>%
  filter(`Disaster Group` == "Winter/Freeze") %>%
  select(Mb, `Disaster Group`,
         `Total CPI-Adjusted Cost (Millions of Dollars)`) %>%
  drop_na()

# Monthly ts starting January 1981.
season_winter.ts <- ts(
  season_Winter$`Total CPI-Adjusted Cost (Millions of Dollars)`,
  start = c(1981, 1), frequency = 12
)

# Month-indicator factor and the seasonal-means regression.
month <- season(season_winter.ts)
seasonal_means <- lm(season_winter.ts ~ month)
summary(seasonal_means)
## 
## Call:
## lm(formula = season_winter.ts ~ month)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8912.1 -1824.2    58.8  1172.7 15981.5 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)      3958.5     2981.9   1.328    0.200
## monthFebruary   -1931.3     4217.0  -0.458    0.652
## monthMarch        155.4     4217.0   0.037    0.971
## monthApril      -1603.5     4217.0  -0.380    0.708
## monthMay         6603.0     4217.0   1.566    0.134
## monthJune        1062.7     4217.0   0.252    0.804
## monthJuly       -2102.9     4217.0  -0.499    0.624
## monthAugust      1692.0     4714.8   0.359    0.724
## monthSeptember   -696.8     4714.8  -0.148    0.884
## monthOctober     3230.1     4714.8   0.685    0.502
## monthNovember   -2237.2     4714.8  -0.475    0.641
## monthDecember     597.0     4714.8   0.127    0.901
## 
## Residual standard error: 5165 on 19 degrees of freedom
## Multiple R-squared:  0.2832, Adjusted R-squared:  -0.1317 
## F-statistic: 0.6826 on 11 and 19 DF,  p-value: 0.7386
# Overlay the fitted monthly means as short horizontal red segments.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
fitted_model <- data.frame(date = time(season_winter.ts), fit = fitted(seasonal_means))

autoplot(season_winter.ts) +
  labs(y = "Cost in Millions") +
  geom_segment(aes(x = date - 0.05, xend = date + 0.05, y = fit, yend = fit),
               linewidth = 2, color = "red", data = fitted_model)

\(\mu_t = \beta_0 + \beta_1 \text{MonthFebruary} + \beta_2 \text{MonthMarch} + ... + \beta_{11} \text{MonthDecember}\)

# Tropical Cyclone monthly cost records (missing costs dropped).
season_tropic <- Eventdf %>%
  filter(`Disaster Group` == "Tropical Cyclone") %>%
  select(Mb, `Disaster Group`,
         `Total CPI-Adjusted Cost (Millions of Dollars)`) %>%
  drop_na()

# Monthly ts starting August 1980.
season_tropic.ts <- ts(
  season_tropic$`Total CPI-Adjusted Cost (Millions of Dollars)`,
  start = c(1980, 8), frequency = 12
)

# Month-indicator factor and the seasonal-means regression.
month <- season(season_tropic.ts)
seasonal_means <- lm(season_tropic.ts ~ month)
summary(seasonal_means)
## 
## Call:
## lm(formula = season_tropic.ts ~ month)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -51244 -19025  -5986   4226 142080 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       30829      17815   1.731   0.0897 .
## monthFebruary    -12031      25194  -0.478   0.6351  
## monthMarch        -4325      25194  -0.172   0.8644  
## monthApril       -16388      25194  -0.650   0.5184  
## monthMay         -18971      25194  -0.753   0.4550  
## monthJune         -1416      25194  -0.056   0.9554  
## monthJuly        -23292      25194  -0.925   0.3597  
## monthAugust      -21226      24122  -0.880   0.3831  
## monthSeptember    -5771      24122  -0.239   0.8119  
## monthOctober     -22494      25194  -0.893   0.3762  
## monthNovember      2731      25194   0.108   0.9141  
## monthDecember     22138      25194   0.879   0.3838  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39840 on 50 degrees of freedom
## Multiple R-squared:  0.1123, Adjusted R-squared:  -0.08304 
## F-statistic: 0.5748 on 11 and 50 DF,  p-value: 0.8399
# Overlay the fitted monthly means as short horizontal red segments.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
fitted_model <- data.frame(date = time(season_tropic.ts), fit = fitted(seasonal_means))

autoplot(season_tropic.ts) +
  labs(y = "Cost in Millions") +
  geom_segment(aes(x = date - 0.05, xend = date + 0.05, y = fit, yend = fit),
               linewidth = 2, color = "red", data = fitted_model)

\(\mu_t = \beta_0 + \beta_1 \text{MonthFebruary} + \beta_2 \text{MonthMarch} + ... + \beta_{11} \text{MonthDecember}\)

# SevStorm/Flood monthly cost records (missing costs dropped).
season_flood <- Eventdf %>%
  filter(`Disaster Group` == "SevStorm/Flood") %>%
  select(Mb, `Disaster Group`,
         `Total CPI-Adjusted Cost (Millions of Dollars)`) %>%
  drop_na()

# Monthly ts starting April 1980.
season_flood.ts <- ts(
  season_flood$`Total CPI-Adjusted Cost (Millions of Dollars)`,
  start = c(1980, 4), frequency = 12
)

# Month-indicator factor and the seasonal-means regression.
month <- season(season_flood.ts)
seasonal_means <- lm(season_flood.ts ~ month)
summary(seasonal_means)
## 
## Call:
## lm(formula = season_flood.ts ~ month)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3644  -1274   -759    169  40212 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      1979.6      823.3   2.404   0.0170 *
## monthFebruary    2874.2     1164.3   2.469   0.0143 *
## monthMarch        928.3     1164.3   0.797   0.4261  
## monthApril        335.7     1149.7   0.292   0.7705  
## monthMay          987.3     1149.7   0.859   0.3914  
## monthJune         437.3     1164.3   0.376   0.7076  
## monthJuly        1107.6     1164.3   0.951   0.3425  
## monthAugust       925.0     1164.3   0.794   0.4278  
## monthSeptember   1052.6     1164.3   0.904   0.3670  
## monthOctober      947.3     1164.3   0.814   0.4168  
## monthNovember     442.1     1164.3   0.380   0.7045  
## monthDecember     200.2     1164.3   0.172   0.8636  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3589 on 218 degrees of freedom
## Multiple R-squared:  0.03906,    Adjusted R-squared:  -0.009432 
## F-statistic: 0.8055 on 11 and 218 DF,  p-value: 0.6346
# Overlay the fitted monthly means as short horizontal red segments.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
fitted_model <- data.frame(date = time(season_flood.ts), fit = fitted(seasonal_means))

autoplot(season_flood.ts) +
  labs(y = "Cost in Millions") +
  geom_segment(aes(x = date - 0.05, xend = date + 0.05, y = fit, yend = fit),
               linewidth = 2, color = "red", data = fitted_model)

For the plots above, there are 4 different seasonal means models, which have been split up by disaster group. The first model represents the TS with only WF/Drought as the disaster type; looking at the visualization of this TS, it can be seen that this model does not predict the data very well. A lot of the points for this plot are not close to the fitted lines shown. The second model depicts the data when Winter/Freeze is the disaster type. This plot does a better job of representing the data; however, there are three spikes which are not close to any of the fitted lines. The third plot depicts the model where Tropical Cyclone is the disaster type. This model again depicts the data fairly well except where the series gets very high. The last model is the one where SevStorm/Flood is the disaster type. This model is fit the best by the lm model. This makes sense, as this disaster type does not have a seasonal relationship quite like the others, and this model fits very well.

  1. (6pts) Fit a sinusoidal function through the data by applying sine and cosine trends. Discuss your findings. State your observations. A min of 5 full sentences is required

\(\mu_t = \beta_0 + \beta_1 \cos(2\pi f t) + \beta_2 \sin(2\pi f t)\)

# Pool every disaster type into one monthly cost series.
season_cos <- Eventdf %>%
  select(Mb, `Disaster Group`,
         `Total CPI-Adjusted Cost (Millions of Dollars)`) %>%
  drop_na()

season_cos.ts <- ts(
  season_cos$`Total CPI-Adjusted Cost (Millions of Dollars)`,
  start = c(1980, 4), frequency = 12
)

# First-harmonic regressors at the annual frequency.
# (`time_idx` rather than `t`, which would shadow base::t.)
time_idx <- time(season_cos.ts)
X1 <- cos(2 * pi * time_idx)
X2 <- sin(2 * pi * time_idx)

# Fit mu_t = beta0 + beta1*cos(2*pi*t) + beta2*sin(2*pi*t).
cosine_model <- lm(season_cos.ts ~ X1 + X2)
summary(cosine_model)
## 
## Call:
## lm(formula = season_cos.ts ~ X1 + X2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8255  -5799  -3648  -1642 186078 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7056.8      912.6   7.733 9.88e-14 ***
## X1            -263.6     1290.5  -0.204   0.8383    
## X2            2359.7     1290.5   1.828   0.0683 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17690 on 373 degrees of freedom
## Multiple R-squared:  0.009003,   Adjusted R-squared:  0.003689 
## F-statistic: 1.694 on 2 and 373 DF,  p-value: 0.1851
# Evaluate the fitted sinusoid on the observed time grid instead of the
# hard-coded seq(1980, 2012, ...), so the curve always matches the span of
# the data; coef() replaces positional $coefficients indexing.
a <- as.numeric(time(season_cos.ts))
coefs <- coef(cosine_model)
fitted_cosine <- coefs[1] + coefs[2] * cos(2 * pi * a) + coefs[3] * sin(2 * pi * a)

cosine_fit <- data.frame(x = a, y = fitted_cosine)

# Series with the fitted sinusoid overlaid in red.
autoplot(season_cos.ts) +
  labs(y = "Total CPI Adjusted Cost (in Millions)") +
  geom_line(aes(x = x, y = y), color = "red", data = cosine_fit)

The sinusoidal model depicted on the graph above does an okay job of representing the data. This TS data includes all of the disaster types and is not split by group. The sinusoidal model looks very good where the disaster costs are more typical; however, if a year has a high value for the total cost, this model becomes a very poor representation of the data. Looking specifically at the year 2003, the disaster relief cost is around $150,000 million; however, the trend line fit by the sinusoidal model is barely above 0. This model struggles compared to the seasonal means models above because a single smooth sinusoid with a fixed amplitude cannot capture such extreme spikes.

  1. (6pts) Select one ACI Region of your choice in ExposureS_PopHo.xlsx and perform EDA for population and housing units. Write a min 5 sentences with your observations.
# Quarterly population and housing-unit exposure by ACI region.
PopHo <- read_excel("ExposureS_PopHo.xlsx")
PopHo
## # A tibble: 2,250 × 7
##    `ACI Region` `ACI Region Name` Subregions `W_Yr-S` MidSeason             Popn
##    <chr>        <chr>             <chr>      <chr>    <dttm>               <dbl>
##  1 ALA          Alaska            AK         1961-1   1961-01-15 00:00:00 229000
##  2 ALA          Alaska            AK         1961-2   1961-04-15 00:00:00 227000
##  3 ALA          Alaska            AK         1961-3   1961-07-15 00:00:00 240000
##  4 ALA          Alaska            AK         1961-4   1961-10-15 00:00:00 238000
##  5 ALA          Alaska            AK         1962-1   1962-01-15 00:00:00 236000
##  6 ALA          Alaska            AK         1962-2   1962-04-15 00:00:00 235000
##  7 ALA          Alaska            AK         1962-3   1962-07-15 00:00:00 248000
##  8 ALA          Alaska            AK         1962-4   1962-10-15 00:00:00 246000
##  9 ALA          Alaska            AK         1963-1   1963-01-15 00:00:00 244000
## 10 ALA          Alaska            AK         1963-2   1963-04-15 00:00:00 242000
## # ℹ 2,240 more rows
## # ℹ 1 more variable: HUnits <dbl>
# Great Plains (GPL) region: housing units per person, quarterly.
GPL_data <- PopHo %>%
  filter(`ACI Region` == "GPL") %>%
  select(`W_Yr-S`, Popn, HUnits) %>%
  mutate(UnitsPerPerson = HUnits/Popn) %>%
  drop_na()

# Quarterly ts starting 1961 Q1 (frequency = 4).
GPL_ts <- ts(GPL_data$UnitsPerPerson, start = c(1961, 1), frequency = 4)


# Time-series plot of the raw ratio.
autoplot(GPL_ts) + labs(y = "Housing Units Per Person")

# Autocorrelation and partial autocorrelation of the series.
ggAcf(GPL_ts)

ggPacf(GPL_ts)

# Quarter-by-quarter comparison across years.
ggseasonplot(GPL_ts)

# Normality check of the series values (fat-pencil test).
qqnorm(GPL_ts)
qqline(GPL_ts, col = "red")

# STL decomposition into trend, seasonal, and remainder components.
autoplot(stl(GPL_ts, s.window = "periodic"))

Starting with the TS plot above, there are a couple of trends in this data. The first apparent trend is that this TS does not have a constant mean; this can be seen as there is a drastic increase in the mean as time increases. This would result in this TS not being stationary. In looking at the ggAcf and the ggPacf, we can see that there are no signs of either white noise or a random walk. Looking specifically at the ggPacf, we see a graph that looks more like a moving-average process. When looking at the ggseasonplot, we can see definite signs of seasonality, especially as the data gains more time. In looking at the decomposition plot above, we can see that the remainder appears relatively normal until the data gets to around 2020, when there is a major increase. Checking assumptions with the normal Q-Q plot, we can see that the plot fails the fat pencil test and the data are not normal.

  1. (6pts) Analyze the population and housing units in ExposureS_PopHo.xlsx using Holt-Winters models and propose the best model. Discuss the performance of this model and include the visual plots learned in class.
# Hold out the first two quarters of 2022 as the test set;
# train on everything through 2021 Q4.
GPL_train <- window(GPL_ts, end = c(2021, 4))
GPL_test <- window(GPL_ts, start = c(2022, 1), end = c(2022, 2))
GPL_test
##           Qtr1      Qtr2
## 2022 0.4570064 0.4563333

In the above code we make our test model the first 2 quarters of 2022.

## Creating different Models

## Candidate exponential-smoothing models
GPL_exp <- HoltWinters(GPL_train, beta=FALSE, gamma=FALSE) # simple exponential smoothing
GPL_trend <- HoltWinters(GPL_train, gamma=FALSE)           # Holt model (trend, no season)
GPL_season <- HoltWinters(GPL_train, beta=FALSE)           # season only (trend coefficient forced to zero)
GPL_hw <- HoltWinters(GPL_train)                           # Holt-Winters (trend & season)

## Forecasting each model over the test horizon
## (h = length(GPL_test) everywhere, instead of a hard-coded 2 for GPL.exp)
GPL.exp <- forecast(GPL_exp, h=length(GPL_test))
GPL.trend <- forecast(GPL_trend, h=length(GPL_test))
GPL.season <- forecast(GPL_season, h=length(GPL_test))
GPL.hw <- forecast(GPL_hw, h=length(GPL_test))

## PI=FALSE says NOT to plot prediction intervals;
## `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0
autoplot(window(GPL_train, start=c(1961,1))) +
  autolayer(GPL.exp, PI=FALSE, series="Exp Smoothing") +
  autolayer(GPL.trend, PI=FALSE, series="HW Trend Only") +
  autolayer(GPL.hw, PI=FALSE, series="Holt Winters") +
  autolayer(GPL.season, PI=FALSE, series="HW Season Only") +
  autolayer(GPL_test, series="True Housing Per Person", linewidth=0.5) +
  labs(x="Year", y="Great Plain Lakes, Housing Units Per Person") +
  theme_bw()

Looking at the plot above, it can be seen that all but one of the models appear to predict the next quarters' Housing Per Person numbers very well. The one model that appears to struggle in this regard is the HW trend-only model, whose forecast starts well below the rest.

# Season-only Holt-Winters forecast against the held-out observations.
# `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
autoplot(window(GPL_train, start=c(1961,1))) +
  autolayer(GPL.season, series="HW Season Only") +
  autolayer(GPL_test, series="True Housing Per Person", linewidth=0.5) +
  labs(x="Year", y="Great Plain Lakes, Housing Units Per Person") +
  theme_bw()

Looking at the season Only vs True Housing Per person, we can see that both models appear to predict the data really well, however looking at the darker lines on the inside, it would appear that both very slightly miss the starting points.

# In-sample and out-of-sample accuracy: simple exponential smoothing.
accuracy(GPL.exp, GPL_test)
##                         ME         RMSE          MAE        MPE      MAPE
## Training set  0.0008235133 0.0027276475 0.0015906402  0.2075424 0.3931337
## Test set     -0.0006795475 0.0007583207 0.0006795475 -0.1488594 0.1488594
##                   MASE        ACF1 Theil's U
## Training set 0.6118987 -0.05141148        NA
## Test set     0.2614131 -0.50000000  1.509581
# In-sample and out-of-sample accuracy: Holt model (trend only).
accuracy(GPL.trend, GPL_test)
##                        ME        RMSE         MAE        MPE      MAPE
## Training set 5.949962e-07 0.002406355 0.001705079 0.00430616 0.4210345
## Test set     3.186708e-03 0.003187287 0.003186708 0.69780504 0.6978050
##                   MASE        ACF1 Theil's U
## Training set 0.6559217  0.06448426        NA
## Test set     1.2258853 -0.50000000  4.644161
# In-sample and out-of-sample accuracy: season-only model.
accuracy(GPL.season, GPL_test)
##                        ME        RMSE         MAE        MPE      MAPE
## Training set  0.000653508 0.001937809 0.001252501  0.1658503 0.3019220
## Test set     -0.002014660 0.002033476 0.002014660 -0.4412082 0.4412082
##                   MASE        ACF1 Theil's U
## Training set 0.4818208  0.08713236        NA
## Test set     0.7750137 -0.50000000  3.403139
# In-sample and out-of-sample accuracy: full Holt-Winters model.
accuracy(GPL.hw, GPL_test)
##                         ME         RMSE          MAE          MPE      MAPE
## Training set -4.605059e-05 0.0017164144 0.0008790562 -0.007680859 0.2066577
## Test set      7.015981e-04 0.0008515571 0.0007015981  0.153555769 0.1535558
##                   MASE       ACF1 Theil's U
## Training set 0.3381615  0.2438169        NA
## Test set     0.2698957 -0.5000000 0.3253477
# Collect the TEST-set RMSE and MAE (row 2, columns 2:3 of each accuracy
# matrix) into one comparison table.
tab <- rbind(accuracy(GPL.exp, GPL_test)[2,2:3],
             accuracy(GPL.trend, GPL_test)[2,2:3],
             accuracy(GPL.season, GPL_test)[2,2:3],
             accuracy(GPL.hw, GPL_test)[2,2:3] )
rownames(tab) <- c("Exp Smoothing", "Holt (trend)", "HW (Season only)", "Holt Winters")
kable(tab)
RMSE MAE
Exp Smoothing 0.0007583 0.0006795
Holt (trend) 0.0031873 0.0031867
HW (Season only) 0.0020335 0.0020147
Holt Winters 0.0008516 0.0007016

Looking at the table output above, we can see that every model has very good accuracy for predicting the data; ultimately, the simple exponential smoothing model has the lowest test-set RMSE, so it is the best model for predicting this data.

# Refit the winning model on the FULL series. NOTE(review): with
# beta = gamma = FALSE this is simple exponential smoothing, not a full
# Holt-Winters model -- the original comment was misleading.
GPL_exp.full <- HoltWinters(GPL_ts, beta=FALSE, gamma=FALSE)
# Two-quarter-ahead forecast (2022 Q3 and Q4).
GPL_exp.forecast <- forecast(GPL_exp.full, h=2)
# Full series with the forecast fan overlaid.
autoplot(window(GPL_ts, start=c(1961,1))) + 
  autolayer(GPL_exp.forecast) + 
  labs(x="Year", y="Great Plain Lakes, Housing Units Per Person") +
  theme_bw()

GPL_exp.forecast
##         Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## 2022 Q3      0.4566276 0.4532982 0.4599571 0.4515357 0.4617196
## 2022 Q4      0.4566276 0.4526896 0.4605656 0.4506050 0.4626503

This model performs extremely well, as it has very high accuracy. When looking at the plot, although the forecast region is small and hard to see, the line looks to accurately predict where I would expect the housing-per-person number to go.

  1. (4pts) Reflect on the in-class and take-home portion of the exam (a min 5 full sentences are required).

When reflecting on the in-class portion of the exam, I felt that it wasn’t incredibly difficult, there were a couple of questions I wasn’t expecting to see, however, there was not anything that I felt unfairly represented the course material we have learned. The hardest part of the exam was the time management, I found myself with 2 problems left, when there was 8 minutes left in the exam, I was able to finish, however, it was not my best work as I was rushing to finish. For the take-home portion of the exam, it was a lot more difficult than I was expecting. The data wrangling required for the first few problems was really difficult to work with, and understanding what you were asking for with the data also felt very difficult. The take home exam, also took me a little over 8 hours to complete, which is not something that I was expecting. I was thinking that this part of the exam would be easier going into our first exam, and as I am finishing the take home part now, I feel as though I was completely wrong. I felt that I understood all the code we went over in class, however, when it came to setting up to data to be analyzed I have struggled.